db_1 <- 
  "../data/202001_202212_02_mx_youtube_data.db"

db_2 <- 
  "../data/202301_202312_02_mx_youtube_data.db"

db_3 <- 
  "../data/202401_202405_02_mx_youtube_data.db"

db_4 <-
  "../data/202405_02_mx_youtube_data.db"

library(DBI)
library(RSQLite)

comments_2020_24 <- 
  data.frame()

videos_2020_24 <-
  data.frame()


for (db in c(db_1, db_2, db_3, db_4)) {
  
  print(db)
  
  # Connect to database
  con <- dbConnect(RSQLite::SQLite(), db)
  
  # Read comments
  comments <- dbGetQuery(con, 
                         "SELECT comment_id, text_original FROM comments")
  
  comments_2020_24 <- 
    rbind(comments_2020_24, 
          comments)
  
  # Close connection
  dbDisconnect(con)
  
}

comments_score <- 
  readRDS("~/Downloads/tmp2/comments_score.rds")

comments_score_zero <-
  comments_score %>%
  dplyr::filter(scalar_sum == 0)

comments_score_not_zero <- 
  comments_score %>%
  dplyr::filter(scalar_sum > 0) %>%
  mutate(
    decile = cut(
      log(scalar_sum), 
      breaks = 9,  # Creates 10 equal-width bins
      labels = 2:10,
      include.lowest = TRUE
    )
  )

table(comments_score_not_zero$decile)

comments_score_not_zero$decile[
  comments_score_not_zero$decile %in% c("8","9","10")
] <- "7"

set.seed(28100)
comments_score_sample <-
  dplyr::bind_rows(
    comments_score_zero %>%
      sample_n(100) %>%
      dplyr::mutate(decile = as.character(decile)),
    comments_score_not_zero %>%
      group_by(decile) %>%
      slice_sample(n = 100, replace = FALSE) %>%
      ungroup() %>%
      dplyr::mutate(decile = as.character(decile))
  )

comments_score_sample$text_original <-
  comments_2020_24$text_original[
    match(comments_score_sample$comment_id, 
          comments_2020_24$comment_id)
  ]

#### Split and save ####

# Select only the required columns
comments_subset <- comments_score_sample %>%
  select(comment_id, text_original)

# Calculate the size of each split
n_total <- nrow(comments_subset)
n_per_file <- ceiling(n_total / 10)

# Split into 10 dataframes and save
for(i in 1:10) {
  # Calculate start and end indices
  start_idx <- (i - 1) * n_per_file + 1
  end_idx <- min(i * n_per_file, n_total)
  
  # Subset the data
  subset_df <- comments_subset[start_idx:end_idx, ]
  
  # Create filename
  filename <- sprintf("comments_batch_%02d.csv", i)
  
  # Save to CSV
  write.csv(subset_df, filename, row.names = FALSE, fileEncoding = "UTF-8")
  
  cat(sprintf("Saved %s: %d rows (indices %d to %d)\n", 
              filename, nrow(subset_df), start_idx, end_idx))
}

cat(sprintf("\nTotal rows: %d\n", n_total))
cat(sprintf("Rows per file: ~%d\n", n_per_file))
